{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "82f90261",
   "metadata": {},
   "source": [
    "# Knowledge Graph Index\n",
    "\n",
    "This tutorial gives a basic overview of how to use our `KnowledgeGraphIndex`, which handles\n",
    "automated knowledge graph construction from unstructured text as well as entity-based querying.\n",
    "\n",
    "If you would like to query knowledge graphs in more flexible ways, including pre-existing ones, please\n",
    "check out our `KnowledgeGraphQueryEngine` and other constructs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# My OpenAI Key\n",
    "import os\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"INSERT OPENAI KEY\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88a9f2e3-c729-455a-a338-2f83776c1d4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import sys\n",
    "\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "be3f7baa-1c0a-430b-981b-83ddca9e71f2",
   "metadata": {},
   "source": [
    "## Using Knowledge Graph"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "75f1d565-04e8-41bc-9165-166dc89b6b47",
   "metadata": {},
   "source": [
    "#### Building the Knowledge Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n"
     ]
    }
   ],
   "source": [
    "from llama_index import (\n",
    "    SimpleDirectoryReader,\n",
    "    ServiceContext,\n",
    "    KnowledgeGraphIndex,\n",
    ")\n",
    "from llama_index.graph_stores import SimpleGraphStore\n",
    "\n",
    "from llama_index.llms import OpenAI\n",
    "from IPython.display import Markdown, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c297fd3-3424-41d8-9d0d-25fe6310ab62",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = SimpleDirectoryReader(\n",
    "    \"../../../../examples/paul_graham_essay/data\"\n",
    ").load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61679142-7595-492b-8792-26cbc439caf8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define LLM\n",
    "# NOTE: at the time of demo, text-davinci-002 did not have rate-limit errors\n",
    "\n",
    "llm = OpenAI(temperature=0, model=\"text-davinci-002\")\n",
    "service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "370fd08f-56ff-4c24-b0c4-c93116a6d482",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "from llama_index.storage.storage_context import StorageContext\n",
    "\n",
    "graph_store = SimpleGraphStore()\n",
    "storage_context = StorageContext.from_defaults(graph_store=graph_store)\n",
    "\n",
    "# NOTE: can take a while!\n",
    "index = KnowledgeGraphIndex.from_documents(\n",
    "    documents,\n",
    "    max_triplets_per_chunk=2,\n",
    "    storage_context=storage_context,\n",
    "    service_context=service_context,\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "8b8f26e8-785c-4f39-87e4-31c6719ef5cf",
   "metadata": {},
   "source": [
    "#### [Optional] Try building the graph and manually add triplets!"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "c39a0eeb-ef16-4982-8ba8-b37c2c5f4437",
   "metadata": {},
   "source": [
    "#### Querying the Knowledge Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "670300d8-d0a8-4201-bbcd-4a74b199fcdd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Starting query: Tell me more about Interleaf\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Query keywords: ['Interleaf', 'company', 'software', 'history']\n",
      "ERROR:llama_index.indices.knowledge_graph.retrievers:Index was not constructed with embeddings, skipping embedding usage...\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Extracted relationships: The following are knowledge triplets in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 116 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 116 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine(\n",
    "    include_text=False, response_mode=\"tree_summarize\"\n",
    ")\n",
    "response = query_engine.query(\n",
    "    \"Tell me more about Interleaf\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eecf2d57-3efa-4b0d-941a-95438d42893c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>\n",
       "Interleaf was a software company that developed and published document preparation and desktop publishing software. It was founded in 1986 and was headquartered in Waltham, Massachusetts. The company was acquired by Quark, Inc. in 2000.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd14686d-1c53-4637-9340-3745f2121ae2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Starting query: Tell me more about what the author worked on at Interleaf\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Query keywords: ['author', 'Interleaf', 'work']\n",
      "ERROR:llama_index.indices.knowledge_graph.retrievers:Index was not constructed with embeddings, skipping embedding usage...\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Extracted relationships: The following are knowledge triplets in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 104 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 104 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine(\n",
    "    include_text=True, response_mode=\"tree_summarize\"\n",
    ")\n",
    "response = query_engine.query(\n",
    "    \"Tell me more about what the author worked on at Interleaf\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4c87d14-d2d8-4d80-89f6-1e5972973528",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>\n",
       "The author worked on a number of projects at Interleaf, including the development of the company's flagship product, the Interleaf Publisher.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "ecc7342a",
   "metadata": {},
   "source": [
    "#### Query with embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b20f9da1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "# NOTE: can take a while!\n",
    "new_index = KnowledgeGraphIndex.from_documents(\n",
    "    documents,\n",
    "    max_triplets_per_chunk=2,\n",
    "    service_context=service_context,\n",
    "    include_embeddings=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01b74b2a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Starting query: Tell me more about what the author worked on at Interleaf\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Query keywords: ['author', 'Interleaf', 'work']\n",
      "ERROR:llama_index.indices.knowledge_graph.retrievers:Index was not constructed with embeddings, skipping embedding usage...\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Extracted relationships: The following are knowledge triplets in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 104 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 104 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "# query using top 3 triplets plus keywords (duplicate triplets are removed)\n",
    "query_engine = index.as_query_engine(\n",
    "    include_text=True,\n",
    "    response_mode=\"tree_summarize\",\n",
    "    embedding_mode=\"hybrid\",\n",
    "    similarity_top_k=5,\n",
    ")\n",
    "response = query_engine.query(\n",
    "    \"Tell me more about what the author worked on at Interleaf\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02084f6d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "<b>\n",
       "The author worked on a number of projects at Interleaf, including the development of the company's flagship product, the Interleaf Publisher.</b>"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "cd582500-584c-409a-9963-921738f1beb8",
   "metadata": {},
   "source": [
    "#### Visualizing the Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9fe3d26-4f9a-4651-b83f-0018672a34e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "example.html\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"100%\"\n",
       "            height=\"600px\"\n",
       "            src=\"example.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "            \n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x126fc43a0>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## create graph\n",
    "from pyvis.network import Network\n",
    "\n",
    "g = index.get_networkx_graph()\n",
    "net = Network(notebook=True, cdn_resources=\"in_line\", directed=True)\n",
    "net.from_nx(g)\n",
    "net.show(\"example.html\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "40b97044-d212-4151-bd72-6ea2cff35a29",
   "metadata": {},
   "source": [
    "#### [Optional] Try building the graph and manually add triplets!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9de2ddb-4e82-438b-ba3a-b7680efed944",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.node_parser import SentenceSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "137176d9-1bc2-4203-8379-7b285cd41546",
   "metadata": {},
   "outputs": [],
   "source": [
    "node_parser = SentenceSplitter()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc609c08-6fce-444c-84cd-a305fcad6bcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "nodes = node_parser.get_nodes_from_documents(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21c3ad61-6f2a-4176-96ba-6e9f52d6243d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "# initialize an empty index for now\n",
    "index = KnowledgeGraphIndex(\n",
    "    [],\n",
    "    service_context=service_context,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41e03f7e-bb98-4fe0-9fc0-369be2864a00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add keyword mappings and nodes manually\n",
    "# add triplets (subject, relationship, object)\n",
    "\n",
    "# for node 0\n",
    "node_0_tups = [\n",
    "    (\"author\", \"worked on\", \"writing\"),\n",
    "    (\"author\", \"worked on\", \"programming\"),\n",
    "]\n",
    "for tup in node_0_tups:\n",
    "    index.upsert_triplet_and_node(tup, nodes[0])\n",
    "\n",
    "# for node 1\n",
    "node_1_tups = [\n",
    "    (\"Interleaf\", \"made software for\", \"creating documents\"),\n",
    "    (\"Interleaf\", \"added\", \"scripting language\"),\n",
    "    (\"software\", \"generate\", \"web sites\"),\n",
    "]\n",
    "for tup in node_1_tups:\n",
    "    index.upsert_triplet_and_node(tup, nodes[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48b1a666-2f84-4524-851a-66efd2beb611",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Starting query: Tell me more about Interleaf\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Query keywords: ['Interleaf', 'company', 'software', 'history']\n",
      "ERROR:llama_index.indices.knowledge_graph.retrievers:Index was not constructed with embeddings, skipping embedding usage...\n",
      "INFO:llama_index.indices.knowledge_graph.retrievers:> Extracted relationships: The following are knowledge triplets in max depth 2 in the form of `subject [predicate, object, predicate_next_hop, object_next_hop ...]`\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 116 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 116 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "query_engine = index.as_query_engine(\n",
    "    include_text=False, response_mode=\"tree_summarize\"\n",
    ")\n",
    "response = query_engine.query(\n",
    "    \"Tell me more about Interleaf\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb4b99d7-452f-4594-94e9-da10a3a23fb8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nInterleaf was a software company that developed and published document preparation and desktop publishing software. It was founded in 1986 and was headquartered in Waltham, Massachusetts. The company was acquired by Quark, Inc. in 2000.'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str(response)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
